library(bnlearn)
library(lattice)

load("AllPapers.RData") # this loads the data frame called "df"
df<-pap


#recodings needed to make the different analyses run (dont add/change any information in the data)
pap$discipline <- revalue(pap$area, c( "Biomedicine &\nhealth sciences"="HS",
                                       "Life sciences"="LS",
                                       "Physical sciences"="PS",
                                       "Soc. sciences\n& humanities"="SS"
))
pap$final.decision <- ifelse(pap$FinalAccept, "Reject","Accept")
pap$ord.decision <- revalue(as.character(pap$decRec), c("1" = "Accept",
                                                        "2"="Minor revisions",
                                                        "3"="Major revisions",
                                                        "4"="Reject Major"))
pap$ediRej <- ifelse(pap$decRec == 4, 1, -1)
#data cleaning
conditions <- !is.na(df$IFRounded)& !is.na(df$autRatFem) & !is.na(df$revRatFem) & !is.na(df$ediRej) & df$rev.score != "NaN"
df <- subset(df, conditions)

#These variables correspond to nodes in the Bayesian network
allVariables <- c("nAuthors","autRatFem","IFRounded","rev.score","ediRej",
                  "discipline","revRatFem","prType","agreement","nRev")
df <- df[allVariables]
df
df$prType <- ifelse(df$prType == "Single Blind",1,0)
df$discipline <- as.factor(df$discipline)
#Dividing the data into training and test data
set.seed(1234)
sampleSize <- floor(0.8 * nrow(df))
indexes <- sample(1:nrow(df), size = sampleSize)
train <- df[indexes, ]
test <- df[-indexes, ]

#To speed up the structure finding algorithm and because this is the only direction of causality that makes sense,
#we do not allow for the review score or editorial decision to affect properties of papers.
blackList <-matrix(c("rev.score", "nAuthors" 
                     ,"rev.score", "autRatFem"
                     ,"rev.score", "discipline"
                     ,"rev.score", "IFRounded"
                     ,"rev.score", "revRatFem"
                     ,"rev.score", "prType"
                     ,"rev.score", "agreement"
                     ,"rev.score", "nRev"
                     ,"ediRej", "rev.score"
                     ,"ediRej", "nAuthors"
                     ,"ediRej", "autRatFem"
                     ,"ediRej", "discipline"
                     ,"ediRej", "IFRounded"
                     ,"ediRej", "revRatFem"
                     ,"ediRej", "prType"
                     ,"ediRej", "agreement"
                     ,"ediRej", "nRev"), ncol = 2, byrow = TRUE, dimnames = list(NULL, c("from", "to")))
blackListNoDis <-matrix(c("rev.score", "nAuthors" 
                     ,"rev.score", "autRatFem"
                     ,"rev.score", "IFRounded"
                     ,"rev.score", "revRatFem"
                     ,"rev.score", "prType"
                     ,"rev.score", "agreement"
                     ,"rev.score", "nRev"
                     ,"ediRej", "rev.score"
                     ,"ediRej", "nAuthors"
                     ,"ediRej", "autRatFem"
                     ,"ediRej", "IFRounded"
                     ,"ediRej", "revRatFem"
                     ,"ediRej", "prType"
                     ,"ediRej", "agreement"
                     ,"ediRej", "nRev"
                     , "nAuthors", "prType"
                     , "agreement", "prType"
                     , "autRatFem", "prType"
                     , "nRev", "prType"
                     , "nRev", "nAuthors"), ncol = 2, byrow = TRUE, dimnames = list(NULL, c("from", "to")))
learnedNetworkHS <- mmhc(test[,-6], blacklist = blackListNoDis)
learnedNetworkLS <- mmhc(subset(df, df$discipline == "LS")[,-6], blacklist = blackListNoDis)
learnedNetworkPS <- mmhc(subset(df, df$discipline == "PS")[,-6], blacklist = blackListNoDis)
learnedNetworkSS <- mmhc(subset(df, df$discipline == "SS")[,-6], blacklist = blackListNoDis)

learnedNetwork <- iamb(train, blacklist = blackList)

#learning weights/coefficients
fittedbn <- bn.fit(learnedNetwork, data = df )
fittedbnHS <- bn.fit(learnedNetworkHS, data = subset(df, df$discipline == "HS")[,-6] )
fittedbnLS <- bn.fit(learnedNetworkLS, data = subset(df, df$discipline == "LS")[,-6] )
fittedbnPS <- bn.fit(learnedNetworkPS, data = subset(df, df$discipline == "PS")[,-6] )
fittedbnSS <- bn.fit(learnedNetworkSS, data = subset(df, df$discipline == "SS")[,-6] )
Coef <- coefficients(fittedbn)
CoefHS <- coefficients(fittedbnHS)
CoefLS <- coefficients(fittedbnLS)
CoefPS <- coefficients(fittedbnPS)
CoefSS <- coefficients(fittedbnSS)
plot(learnedNetworkHS)

#predicting rejections in the test data using the learned Bayesian network
predicted <- predict(fittedbn,node="ediRej",data=bnTest)
predicted <- ifelse(predicted <= 0, -1, 1) #the predictions of the network are made binary
correct <- sum(predicted == bnTest$ediRej)/length(predicted)

#the rest of the code predicts rejection rates for men and women seperately conditional on 
#all possible review scores and plots these conditional rejection rates
a<-data.frame(0,0,0)
for(i in seq(0.05,1.0,0.05)) 
{
  lowWomen <- cpquery(fittedbn, event = (ediRej < 0), evidence = ((autRatFem <= 0) & (rev.score < i)& (rev.score > (i - 0.1))), n=999999) 
  highWomen <- cpquery(fittedbn, event = (ediRej < 0), evidence = ((autRatFem >= 1) & (rev.score < i)& (rev.score > (i - 0.1))), n=999999)
  a[i*20+1,] <- c(i,lowWomen,highWomen)
}

autFemRat <- c(rep("all men",length(a[,1])),rep("all women",length(a[,1])))
notRejected <- c(a$X0.1,a$X0.2)
reviewScore <-rep(seq(0.0,1.0,0.05),2)

xyplot(reviewScore ~ notRejected,groups = autFemRat,type="l",
                  auto.key = list( points=F, lines=T,corner = c(0.9, 0.2) ), ylab="review score", xlab="Probablility of no rejection",
                  ylim =c(0,1), xlim =c(0,1),par.settings=list(superpose.line =list(col=c("red","blue"))), grid=T)